
********** This program generates the results shown in Tables A-2, A-3, and A-4 of the appendix **********
***** The program uses word tables generated in R as input *****

/*
In do-file "generate_text_data" we read the YouGov dataset, keep the relevant 
open ended text variable, and save a txt file with the information in the variable.

In R program "word_count" we read the txt file, clean the text data, make a word 
count and construct a word table dataset. 

In this do-file, we use the most popular words (from the R word table dataset) 
to analyze use of these words at the individual respondent level.
*/

* Start from an empty session and switch off output paging
clear all
set more off

* Work from the data folder and keep the output folder in global "output".
* Both paths are built from global "localdir", which this file does not set;
* it is expected to be defined before the do-file is run.
cd "${localdir}/Data"
global output "${localdir}/Output"


***** Generate words data and statistics using word counts from R in pooled (all MSAs) sample *****

*** Find most popular words based on word count table from R
* Look at words mentioned at least 100 times in pooled sample, pick 20 most mentioned nouns

* Load word count pooled data (word table produced by the R program).
* -use- takes the option "clear" (not "replace") to discard data in memory.
use pooled_wordcloud_q63, clear

* Sort from most to least frequent word so the top words can be inspected
gsort -freq
*br

* 20 most popular nouns 
/*
crime
job
housing
people
education
lack
poverty
homeless
government
transport
violence
city
public
unemployment
traffic
issue
living
tax
drug
cost
*/



*** Indicators for whether respondent mentions a word at least once 

* Read the individual-level survey responses
use msa_survey_indiv, clear

* Only the open-ended answer and the respondent/MSA identifiers are needed
keep major_issues_msa_text caseid msa

* Spot-check the raw strings of two observations (rows 161 and 1094)
generate obs_no = _n
fre major_issues_msa_text if obs_no==161, width(100)
fre major_issues_msa_text if obs_no==1094, width(100)
drop obs_no
* Note: some answers contain garbled symbols (for instance the apostrophe in "Don't"),
* because the "'" symbol is not transferred correctly to Stata.
* This does not matter for the word counts, as none of the words searched for contain "'".
* The problem was fixed in the text files used in R to determine the most used words.

* One 0/1 indicator per word: equal to 1 if the answer contains the word either
* in lower case or with a capital first letter. strpos() is a case-sensitive
* substring search, so a word also matches longer words that contain it
* (e.g. "job" matches "jobs").
foreach word in crime job housing people education lack poverty homeless government transport ///
                violence city public unemployment traffic issue living tax drug cost {
    * Same word with its first letter in upper case (e.g. "crime" -> "Crime")
    local capword = upper(substr("`word'", 1, 1)) + substr("`word'", 2, .)
    generate `word' = (strpos(major_issues_msa_text, "`word'") | strpos(major_issues_msa_text, "`capword'")) != 0
}

* Store the respondent-level word indicators
save single_words, replace



*** Indicators for whether respondent mentions a word from a word category at least once 

* Load survey data
use msa_survey_indiv, clear

keep major_issues_msa_text caseid msa

* Construct categories based on words that appear at least 100 times in pooled sample.
* Each list below holds the search stems of one category. A stem is matched as a
* case-sensitive substring, in lower case and with a capital first letter, so it
* also matches longer words containing it (e.g. "drug" matches "drugs",
* "tax" matches "taxes", "school" matches "schools").
* Categories:
* Crime
local crime_words     crime violence drug police safety
* Economy and employment ("unemployment" is already covered by the stem "employment")
local economy_words   job work income money wage unemployment employment economy economic growth paying
* Housing
local housing_words   housing affordable living price
* Education
local education_words education school
* Poverty and social issues
local social_words    poverty poor inequality health homeless
* Traffic and transport
local transport_words infrastructure transport traffic
* Government and politics
local politics_words  government public city tax area system local people issue state
* Race
local race_words      racial racism race

* One 0/1 indicator per category: 1 if the answer contains at least one stem of
* the category. The capitalised variant is derived from the lower-case stem
* rather than typed by hand, so the two spellings cannot drift apart
* (this is what makes "Education" count towards education_cat).
foreach cat in crime economy housing education social transport politics race {
    gen `cat'_cat = 0
    foreach stem of local `cat'_words {
        * Stem with its first letter in upper case (e.g. "crime" -> "Crime")
        local capstem = upper(substr("`stem'", 1, 1)) + substr("`stem'", 2, .)
        quietly replace `cat'_cat = 1 if strpos(major_issues_msa_text, "`stem'") | strpos(major_issues_msa_text, "`capstem'")
    }
}

* Save data with individual indicators for mentioning at least one word from a category
save word_categories, replace




***** Word stats in pooled sample *****

* Word categories: mean of each 0/1 category indicator over all respondents
use word_categories, clear
local category_vars crime_cat economy_cat housing_cat education_cat social_cat transport_cat politics_cat race_cat

* Statistics shown in Table A-2 of the appendix
tabstat `category_vars', statistics(mean)

* Single words: mean of each 0/1 word indicator over all respondents
use single_words, clear
local word_vars crime job housing people education lack poverty homeless government transport ///
                violence city public unemployment traffic issue living tax drug cost

* Statistics shown in Table A-3 of the appendix
tabstat `word_vars', statistics(mean)




***** Word  stats by individual MSA *****

* Word categories: mean of each 0/1 category indicator, separately for every MSA
use word_categories, clear
local category_vars economy_cat crime_cat politics_cat social_cat housing_cat education_cat transport_cat race_cat

* Statistics shown in Table A-4 of the appendix
by msa, sort: tabstat `category_vars', statistics(mean)

* Single words: mean of each 0/1 word indicator, separately for every MSA
use single_words, clear
local word_vars crime job housing people education lack poverty homeless government transport ///
                violence city public unemployment traffic issue living tax drug cost

by msa, sort: tabstat `word_vars', statistics(mean)







